An Efficient Crop Recommendation using Machine Learning Techniques
R Markdown
This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
# Importing required libraries
library(readr)
library(tidyverse)
library(tidymodels)
library(ggplot2)
library(dplyr)
library(caret)
library(e1071)
library(rpart)
# Read the crop dataset (comma-separated, period decimals).
# BUG FIX: read.csv2() defaults to dec = ","; overriding only sep = ","
# leaves the decimal mark equal to the field separator, which risks
# misparsed numeric columns. read.csv() uses sep = "," and dec = ".".
crop <- read.csv("Cropdata.csv", header = TRUE)
View(crop)
str(crop)
## 'data.frame': 902 obs. of 7 variables:
## $ Time.line: Factor w/ 5 levels "2014-2015","2015-2016",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ PH : num 7.86 7.71 8.01 7.83 8.11 7.53 8.11 7.3 7.69 7.53 ...
## $ EC : num 0.931 0.694 1.11 1.09 1.14 1.02 1.14 1.02 0.921 1.02 ...
## $ N : num 168 100 91 100 82 ...
## $ P : num 17 19.9 16 19.9 17 19.9 17 19.9 17 19.9 ...
## $ k : num 32 43.6 99 43.6 102 43.6 1.2 43.6 36 43.6 ...
## $ Total : num 226 172 215 173 210 ...
head(crop)
## Time.line PH EC N P k Total
## 1 2014-2015 7.86 0.931 168.0 17.0 32.0 225.791
## 2 2014-2015 7.71 0.694 100.1 19.9 43.6 172.004
## 3 2014-2015 8.01 1.110 91.0 16.0 99.0 215.120
## 4 2014-2015 7.83 1.090 100.1 19.9 43.6 172.520
## 5 2014-2015 8.11 1.140 82.0 17.0 102.0 210.250
## 6 2014-2015 7.53 1.020 100.1 19.9 43.6 172.150
summary(crop)
## Time.line PH EC N
## 2014-2015:219 Min. : 0.36 Min. : 0.0090 Min. : 1.81
## 2015-2016:154 1st Qu.: 7.72 1st Qu.: 0.8152 1st Qu.: 100.10
## 2017-2018:224 Median : 7.96 Median : 1.0050 Median : 152.00
## 2018-2019:114 Mean : 18.70 Mean : 7.2213 Mean : 139.50
## 2019-2020:191 3rd Qu.: 8.10 3rd Qu.: 1.0900 3rd Qu.: 169.00
## Max. :7388.00 Max. :952.0000 Max. :1725.00
## P k Total
## Min. : 0.13 Min. : 0.20 Min. : 51.9
## 1st Qu.: 14.00 1st Qu.: 42.00 1st Qu.: 172.7
## Median : 17.00 Median : 43.60 Median : 224.0
## Mean : 26.37 Mean : 57.85 Mean : 249.4
## 3rd Qu.: 19.90 3rd Qu.: 81.00 3rd Qu.: 282.4
## Max. :1282.00 Max. :641.00 Max. :7552.6
# Round Total to the nearest whole number so category boundaries behave
# predictably.
crop$Total <- round(crop$Total, 0)
#**************************************************Step_1*******************************************
# The first step is to create a new column as follows:
# Categories in the Crop_Type column - converting the Total nutrient value
# into a recommended crop.
# BUG FIX: the original used overlapping integer ranges (1:200 and 200:214
# both contain 200) and `%in%` against integer vectors, which silently
# fails for non-integer totals and leaves out-of-range totals as NA.
# Ordered threshold comparisons with a TRUE fallback cover every value;
# case_when() takes the first match, so boundaries are unambiguous.
crop_new <- mutate(crop,
                   Crop_Type = case_when(Total <= 200 ~ "Ground Nut",
                                         Total <= 214 ~ "Sugar Cane",
                                         Total <= 235 ~ "Grape",
                                         Total <= 244 ~ "Onion",
                                         Total <= 250 ~ "Banana",
                                         TRUE         ~ "Turmeric"))
# Persist the labelled data set.
write.table(crop_new, file = "crop_new.csv",
            sep = ",",
            row.names = FALSE)
View(crop_new)
Data Preparation
# Reproducible 70/30 train/test split.
# BUG FIX: without a fixed seed, the partition (and every downstream
# model result reported in this document) changes on each knit.
set.seed(7)
sample_set <- sample(2, nrow(crop_new),
                     replace = TRUE,
                     prob = c(0.7, 0.3))
# Rows drawn as 1 form the training set (~70%).
train <- crop_new[sample_set == 1, ]
head(train)
## Time.line PH EC N P k Total Crop_Type
## 1 2014-2015 7.86 0.931 168.0 17.0 32.0 226 Grape
## 2 2014-2015 7.71 0.694 100.1 19.9 43.6 172 Ground Nut
## 3 2014-2015 8.01 1.110 91.0 16.0 99.0 215 Grape
## 8 2014-2015 7.30 1.020 100.1 19.9 43.6 172 Ground Nut
## 10 2014-2015 7.53 1.020 100.1 19.9 43.6 172 Ground Nut
## 11 2014-2015 8.06 1.040 83.0 16.0 94.0 202 Sugar Cane
#Creating a csv file
# Persist the training partition for later reuse.
write.table(train, file = "crop_train.csv",
sep = ",",
row.names = FALSE)
# Rows drawn as 2 form the hold-out test set (~30%).
test <- crop_new[sample_set==2,]
head(test)
## Time.line PH EC N P k Total Crop_Type
## 4 2014-2015 7.83 1.090 100.1 19.9 43.6 173 Ground Nut
## 5 2014-2015 8.11 1.140 82.0 17.0 102.0 210 Sugar Cane
## 6 2014-2015 7.53 1.020 100.1 19.9 43.6 172 Ground Nut
## 7 2014-2015 8.11 1.140 82.0 17.0 1.2 109 Ground Nut
## 9 2014-2015 7.69 0.921 76.0 17.0 36.0 138 Ground Nut
## 14 2014-2015 8.12 1.400 81.0 17.0 102.0 210 Sugar Cane
#Creating a csv file
# Persist the hold-out partition.
# BUG FIX: the original wrote crop_new to "test.csv"; the intent (per the
# head(test) output above) is to persist the test partition.
write.table(test, file = "test.csv",
            sep = ",",
            row.names = FALSE)
Data Cleaning
# Data-quality check: confirm there are no missing values in either split.
library(DataExplorer)
sum(is.na(train))## [1] 0
sum(is.na(test))## [1] 0
# Visual confirmation of per-column missingness on the training set.
plot_missing(train)
# Exploratory Data Analysis (EDA)
# * describe() computes summary statistics of all numerical variables
library(Hmisc)
## Loading required package: survival
##
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
##
## cluster
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following object is masked from 'package:e1071':
##
## impute
## The following object is masked from 'package:parsnip':
##
## translate
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
describe(train)## train
##
## 8 Variables 644 Observations
## --------------------------------------------------------------------------------
## Time.line
## n missing distinct
## 644 0 5
##
## lowest : 2014-2015 2015-2016 2017-2018 2018-2019 2019-2020
## highest: 2014-2015 2015-2016 2017-2018 2018-2019 2019-2020
##
## Value 2014-2015 2015-2016 2017-2018 2018-2019 2019-2020
## Frequency 147 111 162 87 137
## Proportion 0.228 0.172 0.252 0.135 0.213
## --------------------------------------------------------------------------------
## PH
## n missing distinct Info Mean Gmd .05 .10
## 644 0 163 0.999 9.213 3.166 7.301 7.546
## .25 .50 .75 .90 .95
## 7.720 7.960 8.100 8.210 8.680
##
## lowest : 0.81000 1.12000 1.15000 2.06000 2.72000
## highest: 10.31000 11.26000 18.60000 18.70026 822.00000
##
## Value 0 10 20 820
## Frequency 6 633 4 1
## Proportion 0.009 0.983 0.006 0.002
##
## For the frequency table, variable is rounded to the nearest 10
## --------------------------------------------------------------------------------
## EC
## n missing distinct Info Mean Gmd .05 .10
## 644 0 229 0.999 6.099 10.68 0.1415 0.3130
## .25 .50 .75 .90 .95
## 0.8108 0.9970 1.0800 1.2900 2.1055
##
## lowest : 0.009 0.040 0.050 0.060 0.070
## highest: 151.000 168.000 691.000 921.000 951.000
##
## Value 0 10 20 80 120 150 170 690 920 950
## Frequency 623 12 1 1 1 2 1 1 1 1
## Proportion 0.967 0.019 0.002 0.002 0.002 0.003 0.002 0.002 0.002 0.002
##
## For the frequency table, variable is rounded to the nearest 10
## --------------------------------------------------------------------------------
## N
## n missing distinct Info Mean Gmd .05 .10
## 644 0 77 0.942 138.5 40.58 100.1 100.1
## .25 .50 .75 .90 .95
## 100.1 152.0 169.0 181.0 189.0
##
## lowest : 1.81 7.87 8.00 15.00 17.00, highest: 195.00 196.00 197.00 198.00 199.00
## --------------------------------------------------------------------------------
## P
## n missing distinct Info Mean Gmd .05 .10
## 644 0 39 0.963 27.36 25.07 9.0 11.0
## .25 .50 .75 .90 .95
## 14.0 17.0 19.9 19.9 151.6
##
## lowest : 5 7 8 9 10, highest: 171 172 173 178 1282
## --------------------------------------------------------------------------------
## k
## n missing distinct Info Mean Gmd .05 .10
## 644 0 99 0.966 58.13 32.09 14.15 28.00
## .25 .50 .75 .90 .95
## 42.00 43.60 81.00 95.00 99.00
##
## lowest : 0.20 0.87 1.00 4.00 6.00, highest: 130.00 144.00 146.00 160.00 641.00
## --------------------------------------------------------------------------------
## Total
## n missing distinct Info Mean Gmd .05 .10
## 644 0 154 0.993 239 74.85 172 172
## .25 .50 .75 .90 .95
## 173 226 284 297 309
##
## lowest : 52 78 106 113 116, highest: 908 1079 1122 1152 1397
## --------------------------------------------------------------------------------
## Crop_Type
## n missing distinct
## 644 0 6
##
## lowest : Banana Grape Ground Nut Onion Sugar Cane
## highest: Grape Ground Nut Onion Sugar Cane Turmeric
##
## Value Banana Grape Ground Nut Onion Sugar Cane Turmeric
## Frequency 14 74 232 17 42 265
## Proportion 0.022 0.115 0.360 0.026 0.065 0.411
## --------------------------------------------------------------------------------
describe(test)## test
##
## 8 Variables 258 Observations
## --------------------------------------------------------------------------------
## Time.line
## n missing distinct
## 258 0 5
##
## lowest : 2014-2015 2015-2016 2017-2018 2018-2019 2019-2020
## highest: 2014-2015 2015-2016 2017-2018 2018-2019 2019-2020
##
## Value 2014-2015 2015-2016 2017-2018 2018-2019 2019-2020
## Frequency 72 43 62 27 54
## Proportion 0.279 0.167 0.240 0.105 0.209
## --------------------------------------------------------------------------------
## PH
## n missing distinct Info Mean Gmd .05 .10
## 258 0 90 0.999 42.38 69.4 7.319 7.630
## .25 .50 .75 .90 .95
## 7.740 7.950 8.100 8.190 8.610
##
## lowest : 0.36 0.46 3.83 6.00 6.03
## highest: 10.57 18.60 765.00 768.00 7388.00
##
## Value 0 20 760 7380
## Frequency 253 2 2 1
## Proportion 0.981 0.008 0.008 0.004
##
## For the frequency table, variable is rounded to the nearest 20
## --------------------------------------------------------------------------------
## EC
## n missing distinct Info Mean Gmd .05 .10
## 258 0 118 0.999 10.02 18.37 0.2332 0.4650
## .25 .50 .75 .90 .95
## 0.8310 1.0100 1.0900 1.2800 1.7100
##
## lowest : 0.07 0.08 0.09 0.12 0.14, highest: 101.00 124.00 248.00 898.00 952.00
##
## Value 0 10 100 120 250 900 950
## Frequency 250 3 1 1 1 1 1
## Proportion 0.969 0.012 0.004 0.004 0.004 0.004 0.004
##
## For the frequency table, variable is rounded to the nearest 10
## --------------------------------------------------------------------------------
## N
## n missing distinct Info Mean Gmd .05 .10
## 258 0 61 0.94 142 52.89 99.94 100.10
## .25 .50 .75 .90 .95
## 100.10 148.00 168.75 178.00 186.15
##
## lowest : 15.4 19.0 76.0 81.0 82.0, highest: 196.0 198.0 199.0 275.0 1725.0
## --------------------------------------------------------------------------------
## P
## n missing distinct Info Mean Gmd .05 .10
## 258 0 28 0.957 23.91 18.58 9.0 11.0
## .25 .50 .75 .90 .95
## 14.0 17.0 19.9 19.9 131.7
##
## lowest : 0.13 1.00 5.00 7.00 8.00, highest: 165.00 168.00 171.00 172.00 178.00
## --------------------------------------------------------------------------------
## k
## n missing distinct Info Mean Gmd .05 .10
## 258 0 64 0.961 57.13 30.83 17.0 29.0
## .25 .50 .75 .90 .95
## 41.0 43.6 78.0 96.0 105.7
##
## lowest : 1.2 12.0 13.0 14.0 17.0, highest: 122.0 128.0 146.0 175.0 196.0
## --------------------------------------------------------------------------------
## Total
## n missing distinct Info Mean Gmd .05 .10
## 258 0 108 0.991 275.4 155.5 172.0 172.0
## .25 .50 .75 .90 .95
## 173.0 221.0 279.8 298.6 313.4
##
## lowest : 98 106 109 138 142, highest: 1045 1100 1112 1857 7553
##
## Value 100 200 300 400 500 600 900 1000 1100 1900 7600
## Frequency 6 155 86 2 1 2 1 1 2 1 1
## Proportion 0.023 0.601 0.333 0.008 0.004 0.008 0.004 0.004 0.008 0.004 0.004
##
## For the frequency table, variable is rounded to the nearest 100
## --------------------------------------------------------------------------------
## Crop_Type
## n missing distinct
## 258 0 6
##
## lowest : Banana Grape Ground Nut Onion Sugar Cane
## highest: Grape Ground Nut Onion Sugar Cane Turmeric
##
## Value Banana Grape Ground Nut Onion Sugar Cane Turmeric
## Frequency 3 29 99 9 21 97
## Proportion 0.012 0.112 0.384 0.035 0.081 0.376
## --------------------------------------------------------------------------------
- Two continuous variables
- Taking PH & EC
# Line-and-point plot of log(PH) across growing seasons; points are
# coloured by crop type with a small grey overlay marker.
library(ggplot2)
q <- ggplot(train, aes(x = Time.line, y = log(PH))) +
  geom_line(colour = "darkgreen") +
  geom_point(aes(colour = factor(Crop_Type)), size = 3) +
  geom_point(colour = "grey90", size = 1.5) +
  labs(title = 'Crop according to PH for Time.line 2015-2020',
       y = 'PH of the soil', x = 'Time.line')
q
library(plotly)
# Animated bubble chart: log(PH) vs log(P), bubble size = potassium (k),
# colour = crop type, one animation frame per Time.line season; hover
# text shows the raw P value.
fig <- train %>%
plot_ly(
x = ~log(PH),
y = ~log(P),
size = ~k,
color = ~Crop_Type,
frame = ~Time.line,
text = ~P,
hoverinfo = "text",
type = 'scatter',
mode = 'markers'
)
# Render the x axis on a log scale.
fig <- fig %>% layout(
xaxis = list(
type = "log"
)
)
# BUG FIX: the extraction fused `fig` (printing the animated chart) with
# the next plot_ly() call; they are two separate statements.
fig
# Static scatter of log(PH) by crop type.
plot_ly(train, x = ~log(PH), y = ~Crop_Type ,
type = 'scatter',
mode = 'markers',
marker = list(color = "darkgreen" ), opacity = 0.5) %>%
layout(title = 'Crop according to PH for Time.line 2015-2020',
yaxis = list(title = 'Time.line'),
xaxis = list(title = 'PH of the soil ') )
Boosting Algorithms
# Ensure the target is a factor so caret treats this as classification.
train$Crop_Type <- as.factor(train$Crop_Type)
library(mlbench)
library(caret)
# Shared settings for every model below: 10-fold cross-validation
# repeated 3 times, a fixed seed for reproducibility, and Accuracy as
# the tuning metric.
# (Extraction fix: the original fused `"Accuracy"` with the "Modelling"
# heading on one line.)
control <- trainControl(method="repeatedcv", number=10, repeats=3)
seed <- 7
metric <- "Accuracy"
Modelling
SvmRadial
# SVM with an RBF kernel; caret tunes the cost C (sigma held constant).
set.seed(seed)
fit.svmRadial <- train(Crop_Type~., data=train, method="svmRadial", metric=metric, trControl=control)
fit.svmRadial## Support Vector Machines with Radial Basis Function Kernel
##
## 644 samples
## 7 predictor
## 6 classes: 'Banana', 'Grape', 'Ground Nut', 'Onion', 'Sugar Cane', 'Turmeric'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 580, 578, 579, 580, 578, 580, ...
## Resampling results across tuning parameters:
##
## C Accuracy Kappa
## 0.25 0.8330574 0.7470879
## 0.50 0.8438765 0.7661621
## 1.00 0.8526598 0.7812337
##
## Tuning parameter 'sigma' was held constant at a value of 0.195314
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 0.195314 and C = 1.
Stochastic Gradient Boosting
# Stochastic Gradient Boosting (gbm); caret tunes n.trees and
# interaction.depth.
# NOTE(review): near-perfect accuracy here is expected rather than
# impressive — Crop_Type is derived deterministically from Total (see the
# case_when above) and Total is still a predictor, i.e. target leakage.
set.seed(seed)
fit.gbm <- train(Crop_Type~., data=train, method="gbm", metric=metric, trControl=control, verbose=FALSE)
fit.gbm## Stochastic Gradient Boosting
##
## 644 samples
## 7 predictor
## 6 classes: 'Banana', 'Grape', 'Ground Nut', 'Onion', 'Sugar Cane', 'Turmeric'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 580, 578, 579, 580, 578, 580, ...
## Resampling results across tuning parameters:
##
## interaction.depth n.trees Accuracy Kappa
## 1 50 0.9989663 0.9984885
## 1 100 0.9989581 0.9984760
## 1 150 0.9994792 0.9992400
## 2 50 0.9989663 0.9984900
## 2 100 0.9968737 0.9954233
## 2 150 0.9958481 0.9939448
## 3 50 0.9989501 0.9984689
## 3 100 0.9958406 0.9939460
## 3 150 0.9942771 0.9916607
##
## Tuning parameter 'shrinkage' was held constant at a value of 0.1
##
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were n.trees = 150, interaction.depth =
## 1, shrinkage = 0.1 and n.minobsinnode = 10.
kNN
# kNN is distance-based, so predictors are centred and scaled first.
set.seed(seed)
fit.knn <- train(Crop_Type~., data=train, method="knn", metric=metric, preProc=c("center", "scale"), trControl=control)
fit.knn## k-Nearest Neighbors
##
## 644 samples
## 7 predictor
## 6 classes: 'Banana', 'Grape', 'Ground Nut', 'Onion', 'Sugar Cane', 'Turmeric'
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 580, 578, 579, 580, 578, 580, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 5 0.8980746 0.8506664
## 7 0.8914000 0.8408056
## 9 0.8903744 0.8385319
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 5.
Model Selection
summarize results
# Collect the resampling results of the three candidate models so their
# Accuracy/Kappa distributions can be compared on equal folds.
boosting_results <- resamples(list(svmRadial=fit.svmRadial, gbm=fit.gbm, knn =fit.knn))
boosting_results##
## Call:
## resamples.default(x = list(svmRadial = fit.svmRadial, gbm = fit.gbm, knn
## = fit.knn))
##
## Models: svmRadial, gbm, knn
## Number of resamples: 30
## Performance metrics: Accuracy, Kappa
## Time estimates for: everything, final model fit
summary(boosting_results)##
## Call:
## summary.resamples(object = boosting_results)
##
## Models: svmRadial, gbm, knn
## Number of resamples: 30
##
## Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## svmRadial 0.800000 0.8437500 0.8582589 0.8526598 0.8691349 0.9062500 0
## gbm 0.984375 1.0000000 1.0000000 0.9994792 1.0000000 1.0000000 0
## knn 0.812500 0.8879788 0.9062500 0.8980746 0.9215650 0.9384615 0
##
## Kappa
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## svmRadial 0.7066991 0.7696174 0.7843270 0.7812337 0.8013855 0.8601093 0
## gbm 0.9771999 1.0000000 1.0000000 0.9992400 1.0000000 1.0000000 0
## knn 0.7302424 0.8334432 0.8606617 0.8506664 0.8824172 0.9113838 0
# Dot plot comparing Accuracy/Kappa distributions across the models.
dotplot(boosting_results)
# Bagging Algorithms ## Random Forest
# Random Forest via caret (tunes mtry over the dummy-expanded predictors).
set.seed(seed)
fit.rf <- train(Crop_Type~., data=train, method="rf", metric=metric, trControl=control)
fit.rf## Random Forest
##
## 644 samples
## 7 predictor
## 6 classes: 'Banana', 'Grape', 'Ground Nut', 'Onion', 'Sugar Cane', 'Turmeric'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 580, 578, 579, 580, 578, 580, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.9600648 0.9411590
## 6 0.9912337 0.9870981
## 10 0.9958654 0.9939269
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 10.
# # Example of Bagging algorithms
# control <- trainControl(method="repeatedcv", number=10, repeats=3)
# seed <- 7
# metric <- "Accuracy"
# # Bagged CART
# set.seed(seed)
# fit.treebag <- train(Class~., data=dataset, method="treebag", metric=metric, trControl=control)
# # Random Forest
# set.seed(seed)
# fit.rf <- train(Class~., data=dataset, method="rf", metric=metric, trControl=control)
# # summarize results
# bagging_results <- resamples(list(treebag=fit.treebag, rf=fit.rf))
# summary(bagging_results)
# dotplot(bagging_results)
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
# Bagging via randomForest: mtry equal to the number of predictors (7:
# Time.line, PH, EC, N, P, k, Total) uses every variable at each split.
# BUG FIX: the original passed mtry = 13, which exceeds the predictor
# count (randomForest warned "invalid mtry: reset to within valid range"),
# and misspelled the `ntree` argument as `ntrees` — the misspelling was
# silently swallowed by `...` and only the default of 500 trees made the
# reported output look right.
boston_bag <- randomForest(Crop_Type ~ ., data = train, mtry = 7,
                           importance = TRUE, ntree = 500)
boston_bag##
## Call:
## randomForest(formula = Crop_Type ~ ., data = train, mtry = 13, importance = TRUE, ntrees = 500)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 7
##
## OOB estimate of error rate: 0.31%
## Confusion matrix:
## Banana Grape Ground Nut Onion Sugar Cane Turmeric class.error
## Banana 14 0 0 0 0 0 0.00000000
## Grape 0 74 0 0 0 0 0.00000000
## Ground Nut 0 0 232 0 0 0 0.00000000
## Onion 0 1 0 16 0 0 0.05882353
## Sugar Cane 0 0 1 0 41 0 0.02380952
## Turmeric 0 0 0 0 0 265 0.00000000
# NOTE(review): this repeats the boston_bag fit above verbatim; kept for
# parity with the original document, but one of the two can be removed.
# Same fixes as the first fit: valid mtry (7 predictors) and the correct
# argument name `ntree` (the original's `ntrees` was silently ignored).
boston_bag <- randomForest(Crop_Type ~ ., data = train, mtry = 7,
                           importance = TRUE, ntree = 500)
boston_bag##
## Call:
## randomForest(formula = Crop_Type ~ ., data = train, mtry = 13, importance = TRUE, ntrees = 500)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 7
##
## OOB estimate of error rate: 0.31%
## Confusion matrix:
## Banana Grape Ground Nut Onion Sugar Cane Turmeric class.error
## Banana 14 0 0 0 0 0 0.00000000
## Grape 0 74 0 0 0 0 0.00000000
## Ground Nut 0 0 232 0 0 0 0.00000000
## Onion 0 1 0 16 0 0 0.05882353
## Sugar Cane 0 0 1 0 41 0 0.02380952
## Turmeric 0 0 0 0 0 265 0.00000000
# Random forest proper: omit mtry so randomForest uses its classification
# default (floor(sqrt(p))), which is what distinguishes a random forest
# from the bagged model above.
# BUG FIX: the original's mtry = 17 exceeded the 7 available predictors
# (triggering the "invalid mtry" warning and a silent reset — making this
# identical to the bagged fit), and `ntrees` should be `ntree`.
boston_forest <- randomForest(Crop_Type ~ ., data = train,
                              importance = TRUE, ntree = 500)
boston_forest##
## Call:
## randomForest(formula = Crop_Type ~ ., data = train, mtry = 17, importance = TRUE, ntrees = 500)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 7
##
## OOB estimate of error rate: 0.31%
## Confusion matrix:
## Banana Grape Ground Nut Onion Sugar Cane Turmeric class.error
## Banana 14 0 0 0 0 0 0.00000000
## Grape 0 74 0 0 0 0 0.00000000
## Ground Nut 0 0 232 0 0 0 0.00000000
## Onion 0 1 0 16 0 0 0.05882353
## Sugar Cane 0 0 1 0 41 0 0.02380952
## Turmeric 0 0 0 0 0 265 0.00000000